# Methylome Atlas of Acute Leukemia
## Load data
Show code cell source
import pandas as pd
from source.pacmap_functions import *
# Directories holding the intermediate inputs and the processed outputs.
input_path = '../Data/Intermediate_Files/'
output_path = '../Data/Processed_Data/'

# Discovery and validation data frames, each sorted by its index labels.
# NOTE: unpickling can run arbitrary code -- only load pickle files you trust.
df_discovery = pd.read_pickle(f'{input_path}df_discovery.pkl').sort_index()
df_validation = pd.read_pickle(f'{input_path}df_validation.pkl').sort_index()

# Clinical annotations for each cohort; the first CSV column becomes the index.
discovery_clinical_data = pd.read_csv(
    f'{input_path}discovery_clinical_data.csv', index_col=0, low_memory=False)
validation_clinical_data = pd.read_csv(
    f'{input_path}validation_clinical_data.csv', index_col=0, low_memory=False)
# Label every sample with its cohort role, a constant 'PaCMAP Output' tag and
# its batch. Discovery batches are taken (index-aligned) from the 'Batch'
# column of df_discovery; all validation samples share one fixed batch label.
discovery_clinical_data = discovery_clinical_data.assign(**{
    'Train Test': 'Discovery (train) Samples',
    'PaCMAP Output': 'Patient Samples',
    'Batch': df_discovery['Batch'],
})
validation_clinical_data = validation_clinical_data.assign(**{
    'Train Test': 'Validation (test) Samples',
    'PaCMAP Output': 'Patient Samples',
    'Batch': "St Jude Children's",
})
# Keep only the columns present in both datasets, in df_discovery's column order.
common_features = [
    feature for feature in df_discovery.columns
    if feature in df_validation.columns
]

# Restrict both datasets to that shared set of columns.
df_discovery = df_discovery[common_features]
df_validation = df_validation[common_features]

# Report the resulting dimensions of each dataset.
print(
    f' Discovery dataset (df_discovery) contains {df_discovery.shape[1]} '
    f'columns (5mC nucleotides/probes) and {df_discovery.shape[0]} rows (samples).'
)
print(
    f' Validation dataset (df_validation) contains {df_validation.shape[1]} '
    f'columns (5mC nucleotides/probes) and {df_validation.shape[0]} rows (samples).'
)

# Plot setup; output_notebook and curdoc come from the star import above
# (presumably Bokeh's helpers -- TODO confirm).
output_notebook()
curdoc().theme = 'light_minimal'  # alternative: 'dark_minimal'
## The Methylome Atlas of Acute Leukemia
Show code cell source
# Clinical trial names handed to DataProcessor in the disabled run below.
clinical_trials = [
    'NOPHO ALL92-2000',
    'AAML0531',
    'AAML1031',
    'Beat AML Consortium',
    'TCGA AML',
    'CETLAM SMD-09 (MDS-tAML)',
    'French GRAALL 2003–2005',
    'TARGET ALL',
    'AAML03P1',
    'Japanese AML05',
    'CCG2961',
]

# Sample type labels handed to DataProcessor alongside the trial names.
sample_types = [
    'Diagnosis',
    'Primary Blood Derived Cancer - Bone Marrow',
    'Bone Marrow Normal',
    'Primary Blood Derived Cancer - Peripheral Blood',
    'Blood Derived Normal',
    'Likely Diagnosis',
    'Control (Healthy Donor)',
    'Relapse',
    'Recurrent Blood Derived Cancer - Bone Marrow',
    'Recurrent Blood Derived Cancer - Peripheral Blood',
    'Peripheral Blood Normal',
]

# Clinical columns handed to DataProcessor as its `cols` argument.
cols = [
    'Clinical Trial',
    'Sample Type',
    'Patient_ID',
    'ELN AML 2022 Diagnosis',
    'Train Test',
    'Batch',
    'Hematopoietic Lineage',
]
# Disabled PaCMAP run, kept for reference. It writes one
# `pacmap_{n}d_output_acute_leukemia.csv` per entry in `components`; the 2-D
# file is the one loaded below. Loop-body indentation is reconstructed here
# -- TODO confirm against the original notebook.
#
# components = [2,5]
# for n in components:
#     processor = DataProcessor(discovery_clinical_data.copy(),
#                               df_discovery,
#                               clinical_trials,
#                               sample_types,
#                               cols,
#                               n_components=n,
#                               common_prefix=output_path+f'pacmap_output/pacmap_{n}d_model_acute_leukemia',
#                               df_test=df_validation.copy(),
#                               test_clinical_data=validation_clinical_data.copy())
#     processor.filter_data()
#     processor.apply_pacmap()       # learn PaCMAP on the training data
#     processor.apply_pacmap_test()  # apply PaCMAP to the test data
#     processor.join_labels()
#     # Save output
#     processor.df.to_csv(output_path+f'pacmap_output/pacmap_{n}d_output_acute_leukemia.csv')

# Load the saved 2-D output; the first CSV column becomes the index.
df = pd.read_csv(
    f'{output_path}pacmap_output/pacmap_2d_output_acute_leukemia.csv',
    index_col=0,
)
Show code cell source
# Stack the discovery and validation clinical tables, then select rows by the
# labels stored in df's 'index' column (same order as df).
clinical_data = pd.concat([discovery_clinical_data, validation_clinical_data])
clinical_data = clinical_data.loc[df['index']]

# Annotation columns to attach to df and hand to the plotter.
cols = [
    'PaCMAP Output',
    'Hematopoietic Lineage',
    'WHO 2022 Diagnosis',
    'WHO AML 2022 Diagnosis',
    'WHO ALL 2022 Diagnosis',
    'ELN AML 2022 Diagnosis',
    'Age (group years)',
    'Batch',
    'Sex',
    'Clinical Trial',
    'Sample Type',
    'Train Test',
]

# Attach the annotations by matching df['index'] against clinical_data's index.
# Where a column name already exists in df, the clinical_data copy is suffixed
# with '_copy' and df's own column keeps its name.
# NOTE(review): repeated labels in df['index'] would leave duplicate index
# entries in clinical_data and multiply rows in this join -- confirm they are unique.
df = df.join(clinical_data[cols], on='index', rsuffix='_copy')

# Build the plot with BokehPlotter and render it.
plotter = BokehPlotter(
    df,
    cols,
    get_custom_color_palette(),
    title='The Methylome Atlas of Acute Leukemia',
    x_range=(-40, 40),
    y_range=(-50, 50),
    datapoint_size=3,
)
plotter.plot()